Загрузка данных, обзывание

data <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data", header=TRUE, sep=",")
colnames(data) <- c("sex", "length", "diameter", "height", 
                    "whole_weight", "shucked_weight",
                    "viscera_weight", "shell_weight", "rings")
data$sex <- factor(c("Female", "Infant", "Male")[data$sex])
summary(data)
##      sex           length         diameter          height      
##  Female:1307   Min.   :0.075   Min.   :0.0550   Min.   :0.0000  
##  Infant:1342   1st Qu.:0.450   1st Qu.:0.3500   1st Qu.:0.1150  
##  Male  :1527   Median :0.545   Median :0.4250   Median :0.1400  
##                Mean   :0.524   Mean   :0.4079   Mean   :0.1395  
##                3rd Qu.:0.615   3rd Qu.:0.4800   3rd Qu.:0.1650  
##                Max.   :0.815   Max.   :0.6500   Max.   :1.1300  
##   whole_weight    shucked_weight   viscera_weight     shell_weight   
##  Min.   :0.0020   Min.   :0.0010   Min.   :0.00050   Min.   :0.0015  
##  1st Qu.:0.4415   1st Qu.:0.1860   1st Qu.:0.09337   1st Qu.:0.1300  
##  Median :0.7997   Median :0.3360   Median :0.17100   Median :0.2340  
##  Mean   :0.8288   Mean   :0.3594   Mean   :0.18061   Mean   :0.2389  
##  3rd Qu.:1.1533   3rd Qu.:0.5020   3rd Qu.:0.25300   3rd Qu.:0.3290  
##  Max.   :2.8255   Max.   :1.4880   Max.   :0.76000   Max.   :1.0050  
##      rings       
##  Min.   : 1.000  
##  1st Qu.: 8.000  
##  Median : 9.000  
##  Mean   : 9.932  
##  3rd Qu.:11.000  
##  Max.   :29.000

Возможно интересные колонки

hist(data$diameter, main = "Диаметр, мм")

hist(data$height, main = "Высота, мм")

hist(data$whole_weight, main = "Полный вес, гр")

hist(data$rings, main = "Количество колец на ракушке, шт.")

Линейные модели по изначальным данным

Вес от диаметра

lm.d.1 <- lm(whole_weight ~ diameter, data=data)
summary(lm.d.1)
## 
## Call:
## lm(formula = whole_weight ~ diameter, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56747 -0.12310 -0.03997  0.07211  1.14104 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.03645    0.01216   -85.2   <2e-16 ***
## diameter     4.57295    0.02898   157.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1858 on 4174 degrees of freedom
## Multiple R-squared:  0.8565, Adjusted R-squared:  0.8564 
## F-statistic: 2.491e+04 on 1 and 4174 DF,  p-value: < 2.2e-16
plot(lm.d.1)

Вес от высоты

lm.h.1 <- lm(whole_weight ~ height, data=data)
summary(lm.h.1)
## 
## Call:
## lm(formula = whole_weight ~ height, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7487 -0.1488 -0.0346  0.1151  1.5238 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.51140    0.01516  -33.73   <2e-16 ***
## height       9.60540    0.10408   92.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2813 on 4174 degrees of freedom
## Multiple R-squared:  0.6711, Adjusted R-squared:  0.671 
## F-statistic:  8517 on 1 and 4174 DF,  p-value: < 2.2e-16
plot(lm.h.1)

Количество колец от веса

lm.r.1 <- lm(rings ~ whole_weight, data=data)
summary(lm.r.1)
## 
## Call:
## lm(formula = rings ~ whole_weight, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2707 -1.7504 -0.6856  1.0189 15.7036 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.98615    0.08241   84.77   <2e-16 ***
## whole_weight  3.55485    0.08558   41.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.712 on 4174 degrees of freedom
## Multiple R-squared:  0.2925, Adjusted R-squared:  0.2923 
## F-statistic:  1726 on 1 and 4174 DF,  p-value: < 2.2e-16
plot(lm.r.1)

Избавление от выбросов

data.noout <- data[complete.cases(data),]
data.noout <- data.noout[data.noout$diameter > .1 & data.noout$diameter < .6,]
data.noout <- data.noout[data.noout$height < .2 & data.noout$height > 0.05,]
data.noout <- data.noout[data.noout$whole_weight < 1.8,]
data.noout <- data.noout[data.noout$rings >= 5 & data.noout$rings < 20,]
hist(data.noout$diameter, main = "Диаметр, мм")

hist(data.noout$height, main = "Высота, мм")

hist(data.noout$whole_weight, main = "Полный вес, гр")

hist(data$rings, main = "Количество колец на ракушке, шт.")

Линейные модели по очищеным данным

Вес от диаметра

lm.d.2 <- lm(whole_weight ~ diameter, data=data.noout)
summary(lm.d.2)
## 
## Call:
## lm(formula = whole_weight ~ diameter, data = data.noout)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52430 -0.09620 -0.02237  0.07098  0.91644 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.01201    0.01079   -93.8   <2e-16 ***
## diameter     4.42755    0.02609   169.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1409 on 3759 degrees of freedom
## Multiple R-squared:  0.8845, Adjusted R-squared:  0.8845 
## F-statistic: 2.88e+04 on 1 and 3759 DF,  p-value: < 2.2e-16
plot(lm.d.2)

Вес от высоты

lm.h.2 <- lm(whole_weight ~ height, data=data.noout)
summary(lm.h.2)
## 
## Call:
## lm(formula = whole_weight ~ height, data = data.noout)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82800 -0.11492 -0.00633  0.10831  0.91220 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.76602    0.01344  -56.99   <2e-16 ***
## height      11.30566    0.09580  118.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1911 on 3759 degrees of freedom
## Multiple R-squared:  0.7875, Adjusted R-squared:  0.7874 
## F-statistic: 1.393e+04 on 1 and 3759 DF,  p-value: < 2.2e-16
plot(lm.h.2)

Кол-во колец от веса

lm.r.2 <- lm(rings ~ whole_weight, data=data.noout)
summary(lm.r.2)
## 
## Call:
## lm(formula = rings ~ whole_weight, data = data.noout)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7179 -1.6179 -0.6223  0.9662 10.5039 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.10982    0.08160   87.14   <2e-16 ***
## whole_weight  3.36476    0.09266   36.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.355 on 3759 degrees of freedom
## Multiple R-squared:  0.2597, Adjusted R-squared:  0.2595 
## F-statistic:  1319 on 1 and 3759 DF,  p-value: < 2.2e-16
plot(lm.r.2)

Делим пополам

odds <- seq(1, nrow(data.noout), by=2)
data.in <- data.noout[odds,]
data.out <- data.noout[-odds,]

Предсказание веса по диаметру

lm.d.half <- lm(whole_weight ~ diameter, data=data.in)
summary(lm.d.half)
## 
## Call:
## lm(formula = whole_weight ~ diameter, data = data.in)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48611 -0.09479 -0.01969  0.06878  0.91792 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.01689    0.01518  -67.01   <2e-16 ***
## diameter     4.43565    0.03669  120.89   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1395 on 1879 degrees of freedom
## Multiple R-squared:  0.8861, Adjusted R-squared:  0.886 
## F-statistic: 1.462e+04 on 1 and 1879 DF,  p-value: < 2.2e-16
data.d.pred <- predict(lm.d.half)
cor(data.in$whole_weight, data.d.pred)
## [1] 0.9413183
plot(data.in$whole_weight, data.d.pred)

data.d.pred.out <- predict(lm.d.half, data.out)
cor(data.out$whole_weight, data.d.pred.out)
## [1] 0.9397162
plot(data.out$whole_weight, data.d.pred.out)

Предсказание веса по высоте

lm.h.half <- lm(whole_weight ~ height, data=data.in)
summary(lm.h.half)
## 
## Call:
## lm(formula = whole_weight ~ height, data = data.in)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.83191 -0.11657 -0.00807  0.10455  0.91126 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.77739    0.01947  -39.94   <2e-16 ***
## height      11.39055    0.13894   81.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1932 on 1879 degrees of freedom
## Multiple R-squared:  0.7815, Adjusted R-squared:  0.7814 
## F-statistic:  6721 on 1 and 1879 DF,  p-value: < 2.2e-16
data.h.pred <- predict(lm.h.half)
cor(data.in$whole_weight, data.h.pred)
## [1] 0.8840352
plot(data.in$whole_weight, data.h.pred)

data.h.pred.out <- predict(lm.h.half, data.out)
cor(data.out$whole_weight, data.h.pred.out)
## [1] 0.8907289
plot(data.out$whole_weight, data.h.pred.out)

Предсказание кол-ва колец по весу

lm.r.half <- lm(rings ~ whole_weight, data=data.in)
summary(lm.r.half)
## 
## Call:
## lm(formula = rings ~ whole_weight, data = data.in)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7233 -1.6340 -0.6424  1.0293  9.6802 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.1402     0.1157   61.70   <2e-16 ***
## whole_weight   3.3466     0.1316   25.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.359 on 1879 degrees of freedom
## Multiple R-squared:  0.256,  Adjusted R-squared:  0.2556 
## F-statistic: 646.6 on 1 and 1879 DF,  p-value: < 2.2e-16
data.r.pred <- predict(lm.r.half)
cor(data.in$rings, data.r.pred)
## [1] 0.5059697
plot(data.in$rings, data.r.pred)

data.r.pred.out <- predict(lm.r.half, data.out)
cor(data.out$rings, data.r.pred.out)
## [1] 0.5132176
plot(data.out$rings, data.r.pred.out)